library(tidyverse) 
library(skimr)
library(codebook)
# Read the raw stroke CSV into `stroke`; the `##` lines that follow are the
# column specification readr printed for this call.
# NOTE(review): bmi is parsed as character (see the spec below) although it
# looks like a numeric measure -- presumably the file uses a textual
# missing-value placeholder in that column. Confirm against the raw CSV; if so,
# pass that placeholder via `na =` so bmi parses as double and its missing
# values are actually counted downstream.
stroke <- read_csv("healthcare_dataset_stroke_data.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   id = col_double(),
##   gender = col_character(),
##   age = col_double(),
##   hypertension = col_double(),
##   heart_disease = col_double(),
##   ever_married = col_character(),
##   work_type = col_character(),
##   Residence_type = col_character(),
##   avg_glucose_level = col_double(),
##   bmi = col_character(),
##   smoking_status = col_character(),
##   stroke = col_double()
## )
# Persist the loaded data frame as an .RData file in the working directory.
save(stroke,file="./stroke.RData")

Stroke Dataset Codebook

# Generate the codebook report for `stroke`; the report text follows below.
# NOTE(review): the "No missing values" result only counts real NA values.
# bmi was read as a character column, so any textual placeholder for missing
# data in it would not be counted here -- verify before relying on this.
codebook(stroke)
## No missing values.

Metadata

Description

Dataset name: stroke

The dataset has N=5110 rows and 12 columns. 5110 rows have no missing values on any column.

Metadata for search engines
  • Date published: 2021-04-29
x
id
gender
age
hypertension
heart_disease
ever_married
work_type
Residence_type
avg_glucose_level
bmi
smoking_status
stroke

Variables

id

Distribution

Distribution of values for id

Distribution of values for id

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
id numeric 0 1 67 36932 72940 36517.83 21161.72 ▇▇▇▇▇ NA

gender

Distribution

Distribution of values for gender

Distribution of values for gender

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
gender character 0 1 3 0 4 6 0 NA

age

Distribution

Distribution of values for age

Distribution of values for age

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
age numeric 0 1 0.08 45 82 43.22661 22.61265 ▅▆▇▇▆ NA

hypertension

Distribution

Distribution of values for hypertension

Distribution of values for hypertension

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
hypertension numeric 0 1 0 0 1 0.097456 0.2966067 ▇▁▁▁▁ NA

heart_disease

Distribution

Distribution of values for heart_disease

Distribution of values for heart_disease

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
heart_disease numeric 0 1 0 0 1 0.0540117 0.226063 ▇▁▁▁▁ NA

ever_married

Distribution

Distribution of values for ever_married

Distribution of values for ever_married

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
ever_married character 0 1 2 0 2 3 0 NA

work_type

Distribution

Distribution of values for work_type

Distribution of values for work_type

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
work_type character 0 1 5 0 7 13 0 NA

Residence_type

Distribution

Distribution of values for Residence_type

Distribution of values for Residence_type

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
Residence_type character 0 1 2 0 5 5 0 NA

avg_glucose_level

Distribution

Distribution of values for avg_glucose_level

Distribution of values for avg_glucose_level

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
avg_glucose_level numeric 0 1 55 92 272 106.1477 45.28356 ▇▃▁▁▁ NA

bmi

Distribution

Distribution of values for bmi

Distribution of values for bmi

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
bmi character 0 1 419 0 2 4 0 NA

smoking_status

Distribution

Distribution of values for smoking_status

Distribution of values for smoking_status

0 missing values.

Summary statistics

name data_type n_missing complete_rate n_unique empty min max whitespace label
smoking_status character 0 1 4 0 6 15 0 NA

stroke

Distribution

Distribution of values for stroke

Distribution of values for stroke

0 missing values.

Summary statistics

name data_type n_missing complete_rate min median max mean sd hist label
stroke numeric 0 1 0 0 1 0.048728 0.2153199 ▇▁▁▁▁ NA

Missingness report

Codebook table

JSON-LD metadata

The following JSON-LD can be found by search engines, if you share this codebook publicly on the web.

{
  "name": "stroke",
  "datePublished": "2021-04-29",
  "description": "The dataset has N=5110 rows and 12 columns.\n5110 rows have no missing values on any column.\n\n\n## Table of variables\nThis table contains variable names, labels, and number of missing values.\nSee the complete codebook for more.\n\n|name              |label | n_missing|\n|:-----------------|:-----|---------:|\n|id                |NA    |         0|\n|gender            |NA    |         0|\n|age               |NA    |         0|\n|hypertension      |NA    |         0|\n|heart_disease     |NA    |         0|\n|ever_married      |NA    |         0|\n|work_type         |NA    |         0|\n|Residence_type    |NA    |         0|\n|avg_glucose_level |NA    |         0|\n|bmi               |NA    |         0|\n|smoking_status    |NA    |         0|\n|stroke            |NA    |         0|\n\n### Note\nThis dataset was automatically described using the [codebook R package](https://rubenarslan.github.io/codebook/) (version 0.9.2).",
  "keywords": ["id", "gender", "age", "hypertension", "heart_disease", "ever_married", "work_type", "Residence_type", "avg_glucose_level", "bmi", "smoking_status", "stroke"],
  "@context": "http://schema.org/",
  "@type": "Dataset",
  "variableMeasured": [
    {
      "name": "id",
      "@type": "propertyValue"
    },
    {
      "name": "gender",
      "@type": "propertyValue"
    },
    {
      "name": "age",
      "@type": "propertyValue"
    },
    {
      "name": "hypertension",
      "@type": "propertyValue"
    },
    {
      "name": "heart_disease",
      "@type": "propertyValue"
    },
    {
      "name": "ever_married",
      "@type": "propertyValue"
    },
    {
      "name": "work_type",
      "@type": "propertyValue"
    },
    {
      "name": "Residence_type",
      "@type": "propertyValue"
    },
    {
      "name": "avg_glucose_level",
      "@type": "propertyValue"
    },
    {
      "name": "bmi",
      "@type": "propertyValue"
    },
    {
      "name": "smoking_status",
      "@type": "propertyValue"
    },
    {
      "name": "stroke",
      "@type": "propertyValue"
    }
  ]
}

Skim Stroke Dataset Summary

# Print skimr's summary of `stroke`: overall dimensions, then per-column
# statistics reported separately for character and numeric columns.
skim(stroke)
Data summary
Name stroke
Number of rows 5110
Number of columns 12
_______________________
Column type frequency:
character 6
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
gender 0 1 4 6 0 3 0
ever_married 0 1 2 3 0 2 0
work_type 0 1 7 13 0 5 0
Residence_type 0 1 5 5 0 2 0
bmi 0 1 2 4 0 419 0
smoking_status 0 1 6 15 0 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 36517.83 21161.72 67.00 17741.25 36932.00 54682.00 72940.00 ▇▇▇▇▇
age 0 1 43.23 22.61 0.08 25.00 45.00 61.00 82.00 ▅▆▇▇▆
hypertension 0 1 0.10 0.30 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
heart_disease 0 1 0.05 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
avg_glucose_level 0 1 106.15 45.28 55.12 77.24 91.88 114.09 271.74 ▇▃▁▁▁
stroke 0 1 0.05 0.22 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁